Processing
Packages and external functions
type_col = readRDS("./../data/data_sel_type_col.rds")
data_sel = fread(file = "./../data/data_sel.csv", colClasses = type_col) %>% as.data.frame()
Some categorical variables have levels that are very poorly represented (relative frequency below 5%). Some levels therefore need to be merged: it would be worrying to let the algorithms learn from a mere handful of observations how a level interacts with its environment (other observations, other levels, other variables, the target).
Two strategies are used here to merge levels:
- Project the levels onto the first factorial plane of an FAMD (AFDM) and group together the levels projected in the same direction (clusters).
- Merge levels according to their positive-target rate: levels that behave similarly with respect to the target are grouped when needed (a minimal sketch of this idea follows).
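As an illustration of the second strategy, here is a hypothetical sketch (ps_ind_02_cat and the three-bin cut are arbitrary illustrative choices; target is assumed to be a factor with levels "0"/"1"):
# Compute each level's positive-target rate on the training set,
# then bin levels with similar rates into shared groups.
rate_groups <- data_sel %>%
  filter(dataset == "train") %>%
  group_by(ps_ind_02_cat) %>%
  summarise(pos_rate = mean(target == "1")) %>%
  mutate(group = cut(pos_rate, breaks = 3, labels = c("A", "B", "C")))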
Merging factor levels
AFDM
df <- data_sel %>% filter(dataset == "train")
# Long computation: run once (if (0) skips it on re-knit), then load from disk.
if (0) {
  res_famd <- FAMD(df[-c(1:3)], ncp = 2, graph = FALSE)
  saveRDS(object = res_famd, file = "./../data/res_famd.rds")
}
res_famd <- readRDS("./../data/res_famd.rds")
coord <- res_famd$quali.var$coord %>% as.data.frame()
liste_var_factor = type_col[type_col == "factor"]
liste_var_factor <- names(liste_var_factor[-1])
# Build a vector mapping each row of coord to its source variable,
# with one entry per level actually present in the training set.
name <- c()
for (var in liste_var_factor) {
  df <- data_sel %>% filter(dataset == "train") %>% select(var) %>% droplevels() # fix (gt)
  colnames(df) <- "col"
  n <- length(levels(df$col))
  name <- c(name, rep(var, n))
}
# name <- name[-c(239,302)]
# One entry of each was removed because the level is absent from train (ps_calc_06 and ps_calc_11).
# Fix (gt): instead of removing entries by hand, droplevels() handles this problem.
coord <- cbind.data.frame(name, coord)
# Clean the FAMD rownames into readable level labels: numeric-looking levels
# were turned into syntactic names (an "X" prefix), so strip the prefix
# according to how many dots the rowname contains.
modalite <- data.frame("modalite" = rownames(coord), stringsAsFactors = FALSE) %>%
  mutate(nb = str_count(modalite, pattern = "\\."),
         clean = case_when(nb == 0 ~ gsub("X", "", modalite),
                           nb == 1 ~ str_extract(modalite, regex("(?<=X)[0-9]*")),
                           nb == 2 ~ str_extract(modalite, regex("(?<=X)[0-9]*\\.[0-9]*"))))
coord <- cbind.data.frame(coord, "modalite" = modalite$clean)
coord$modalite <- as.character(coord$modalite)
# The decimal labels of ps_reg_01 / ps_reg_02 do not survive the cleaning above,
# so they are restored by hand.
coord$modalite[which(name == "ps_reg_01")] <- seq(0, 0.9, 0.1)
coord$modalite[which(name == "ps_reg_02")] <- seq(0, 1.8, 0.1)
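To make the cleaning rule concrete, here is what each case_when branch yields on hypothetical rownames (the strings below are illustrative, not actual FAMD output):
str_count("X3", "\\.")                                    # 0 dots
gsub("X", "", "X3")                                       # -> "3"
str_extract("grp.X12", regex("(?<=X)[0-9]*"))             # 1 dot  -> "12"
str_extract("grp.X0.5", regex("(?<=X)[0-9]*\\.[0-9]*"))   # 2 dots -> "0.5"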
# Plot each variable's levels on the first factorial plane.
for (var in liste_var_factor) {
  df <- coord %>% filter(name == var) %>% select(-name)
  print(gg_proj_ind(df, 1, 2) + geom_text_repel(aes(label = modalite)) + ggtitle(var))
}
data_afdm = data_sel
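The remappings below use a standard R idiom: assigning levels() a vector with repeated labels collapses the corresponding levels into one. A toy example:
f <- factor(c("a", "b", "c"))
levels(f) <- c("A", "B", "A")  # "a" and "c" are merged into "A"
table(f)
#> f
#> A B
#> 2 1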
levels(data_afdm$ps_ind_05_cat) = c("A", "B", "A", "B", "B", "A", "C") # level "6" is left on its own: very atypical, and at almost 5% anyway...
levels(data_afdm$ps_car_01_cat) = c("A", "B", "B", "B", "C", "B", "C", "B", "A", "D", "A", "D")
# ps_car_11_cat is set aside for a logistic regression
levels(data_afdm$ps_car_06_cat) = c("A", "B", "C", "D", "E", "F", "E", "D", "C", "F", "D", "G", "D", "C", "H", "C", "F", "F")
levels(data_afdm$ps_car_09_cat) = c("A", "B", "C", "C", "D") # level "1" is left on its own: atypical, and at almost 5% anyway...
levels(data_afdm$ps_ind_02_cat) = c("A", "B", "C", "C")
levels(data_afdm$ps_car_04_cat) = c("A", "B", "B", "B", "B", "B", "C", "C", "C", "C")
Contingency tables with the target
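gg_contingence_freq is another external helper not shown in this section; a rough, hypothetical stand-in, assuming it takes a column name and plots the relative frequency of each target value within each level:
gg_contingence_freq_sketch <- function(var) {
  df <- data_sel %>%
    filter(dataset == "train") %>%
    group_by(level = .data[[var]], target) %>%
    summarise(n = n(), .groups = "drop_last") %>%
    mutate(freq = n / sum(n))   # relative frequency within each level
  ggplot(df, aes(level, freq, fill = target)) +
    geom_col() +
    ggtitle(var)
}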
for (var in liste_var_factor[2:length(liste_var_factor)]) { # liste_var_factor is already a character vector, so no names() needed
  print(gg_contingence_freq(var))
}
data_tabc = data_sel
levels(data_tabc$ps_ind_05_cat) = c("A", "B", "C", "B", "C", "B", "C")
levels(data_tabc$ps_car_01_cat) = c("A", "A", "A", "B", "B", "B", "C", "D", "E", "A", "F", "E")
# ps_car_11_cat is set aside for a logistic regression
levels(data_tabc$ps_car_06_cat) = c("A", "B", "C", "C", "C", "C", "D", "D", "E", "E", "F", "G", "H", "H", "H", "E", "E", "E")
levels(data_tabc$ps_car_09_cat) = c("A", "B", "C", "C", "C")
levels(data_tabc$ps_ind_02_cat) = c("A", "B", "C", "C")
levels(data_tabc$ps_car_04_cat) = c("A", "B", "B", "B", "B", "C", "C", "C", "C", "C")
Note: ps_car_11_cat gets a logistic regression instead; we retrieve the betas, which will serve as "levels" for building a continuous variable (a join then replaces each level with its beta).
Logistic regression on a variable with 104 levels
df <- data_afdm %>% filter(dataset == "train")
model <- glm(data = df, target ~ ps_car_11_cat, family = binomial(link = logit))
res <- summary(model)
# One row per non-reference level: strip the variable prefix from the coefficient names.
df <- cbind.data.frame(ps_car_11_cat = gsub("ps_car_11_cat", "", rownames(res$coefficients)), res$coefficients)
df <- df %>% select(ps_car_11_cat, Estimate)
df$ps_car_11_cat <- as.factor(df$ps_car_11_cat)
# Imputation into the AFDM dataset
data_afdm <- left_join(data_afdm, df, "ps_car_11_cat")
Warning: Column `ps_car_11_cat` joining factors with different levels, coercing to character vector
data_afdm$ps_car_11_cat <- data_afdm$Estimate
data_afdm <- data_afdm %>% select(-Estimate)
data_afdm$ps_car_11_cat[is.na(data_afdm$ps_car_11_cat)] = 0 # NA means the level was the reference level, hence a coefficient of 0
# Imputation into the TABC dataset
data_tabc <- left_join(data_tabc, df, "ps_car_11_cat")
Warning: Column `ps_car_11_cat` joining factors with different levels, coercing to character vector
data_tabc$ps_car_11_cat <- data_tabc$Estimate
data_tabc <- data_tabc %>% select(-Estimate)
data_tabc$ps_car_11_cat[is.na(data_tabc$ps_car_11_cat)] = 0 # NA means the level was the reference level, hence a coefficient of 0
# Density plot to see how the encoded values are distributed
ggplot(data_afdm, aes(ps_car_11_cat, color = target)) +
  geom_density() + ggtitle("ps_car_11_cat")
Having merged the levels with two different strategies, we can now train a LightGBM model to measure the impact of these treatments on the model's predictive power.
Benchmark - LGBM (on AFDM data)
# LGBM
train_X_lgb = data_afdm %>%
filter(dataset == "train") %>%
select(-id, -target, -dataset) %>%
lgb.prepare() %>%
as.matrix()
train_Y_lgb = data_afdm %>%
filter(dataset == "train") %>%
select(target) %>%
lgb.prepare() %>%
as.matrix() - 1 # factor codes 1/2 shifted to the 0/1 labels LightGBM expects
train_Y_num = as.numeric(data_afdm$target[data_afdm$dataset == "train"]) - 1
test_X_lgb = data_afdm %>%
filter(dataset == "test") %>%
select(-id, -target, -dataset) %>%
lgb.prepare() %>%
as.matrix()
test_Y_lgb = data_afdm %>%
filter(dataset == "test") %>%
select(target) %>%
lgb.prepare() %>%
as.matrix()
test_Y_num = as.numeric(data_afdm$target[data_afdm$dataset == "test"]) - 1
train_XY_lgb = lgb.Dataset(train_X_lgb, label = train_Y_lgb)
set.seed(1234)
params_lgb = list(
objective = "binary", # type of exercise
metric = "auc", # metric to be evaluated
learning_rate = 0.01, # shrinkage rate
max_depth = 10, # max depth for tree model (used to deal with over-fitting when data is small)
num_leaves = 20, # max number of leaves (nodes) in one tree
is_unbalance = T, # is data unbalanced
min_data_in_leaf = 1, # min number of data in one leaf (used to deal with over-fitting)
feature_fraction = 0.8, # randomly select part of the features on each iteration
bagging_fraction = 0.8, # randomly select part of the data without resampling
bagging_freq = 5, # if != 0, enables bagging, performs bagging at every k iteration
num_threads = 6 # number of cpu cores (not threads) to use
)
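The eval argument below points to lgb.normalizedgini, a project helper loaded with the external functions; neither it nor normalizedGini is shown in this section. A minimal sketch of what they might look like, assuming the lightgbm R custom-eval signature (preds, dtrain) and the usual Lorenz-curve form of the normalized Gini (the metric name "Norm-gini" matches how the CV results are read back further down):
# Sketch only: the real helpers live in the external functions file.
normalizedGini <- function(actual, predicted) {
  gini <- function(a, p) {
    ord <- order(p, decreasing = TRUE)            # rank observations by score
    lorentz <- cumsum(a[ord]) / sum(a)            # cumulative share of positives
    sum(lorentz - seq_along(a) / length(a)) / length(a)
  }
  gini(actual, predicted) / gini(actual, actual)  # normalize by the perfect model
}
lgb.normalizedgini <- function(preds, dtrain) {
  actual <- lightgbm::getinfo(dtrain, "label")
  list(name = "Norm-gini", value = normalizedGini(actual, preds), higher_better = TRUE)
}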
post_fusion_afdm_lgb_cv = lgb.cv(
params = params_lgb, # hyperparameters
data = train_XY_lgb, # lgb.Dataset object for training
eval = lgb.normalizedgini, # custom metric, additional to the first metric
nrounds = 1000, # maximum iterations
early_stopping_rounds = 50, # stop if the eval metric hasn't improved in 50 rounds
verbose = 1, # enable verbose
eval_freq = 50, # verbose every n iterations
nfold = 5 # k-folds CV
)
post_fusion_afdm_lgb_model <- lgb.train(
params = params_lgb, # hyperparameters
data = train_XY_lgb, # lgb.Dataset object for training
valids = list(train = train_XY_lgb), # lgb.Dataset object for validation
eval = lgb.normalizedgini, # custom metric, additional to the first metric
nrounds = post_fusion_afdm_lgb_cv$best_iter, # nrounds from CV
verbose = 1, # enable verbose
eval_freq = 50 # verbose every n iterations
)
# save(post_fusion_afdm_lgb_cv, file = "./../data/post_fusion_afdm_lgb_cv.RData")
# lgb.save(post_fusion_afdm_lgb_model, filename = "./../data/post_fusion_afdm_lgb_model.txt")
load("./../data/post_fusion_afdm_lgb_cv.RData")
post_fusion_afdm_lgb_model = lgb.load("./../data/post_fusion_afdm_lgb_model.txt")
post_fusion_afdm_lgb_train_preds = predict(post_fusion_afdm_lgb_model, train_X_lgb)
post_fusion_afdm_lgb_test_preds = predict(post_fusion_afdm_lgb_model, test_X_lgb)
post_fusion_afdm_lgb_train_ngini = normalizedGini(train_Y_num, post_fusion_afdm_lgb_train_preds)
post_fusion_afdm_lgb_test_ngini = normalizedGini(test_Y_num, post_fusion_afdm_lgb_test_preds)
ngini_valid = post_fusion_afdm_lgb_cv[["record_evals"]][["valid"]][["Norm-gini"]][["eval"]] %>% unlist %>% max
c("Normalized Gini Coeff. (Train)" = post_fusion_afdm_lgb_train_ngini,
"Normalized Gini Coeff. (Valid - 5fCV)" = ngini_valid,
"Normalized Gini Coeff. (Test)" = post_fusion_afdm_lgb_test_ngini) Normalized Gini Coeff. (Train)
0.3491641
Normalized Gini Coeff. (Valid - 5fCV)
0.2728841
Normalized Gini Coeff. (Test)
0.2827741
Benchmark - LGBM (on TABC data)
# LGBM
train_X_lgb = data_tabc %>%
filter(dataset == "train") %>%
select(-id, -target, -dataset) %>%
lgb.prepare() %>%
as.matrix()
train_Y_lgb = data_tabc %>%
filter(dataset == "train") %>%
select(target) %>%
lgb.prepare() %>%
as.matrix() - 1
train_Y_num = as.numeric(data_tabc$target[data_tabc$dataset == "train"]) - 1
test_X_lgb = data_tabc %>%
filter(dataset == "test") %>%
select(-id, -target, -dataset) %>%
lgb.prepare() %>%
as.matrix()
test_Y_lgb = data_tabc %>%
filter(dataset == "test") %>%
select(target) %>%
lgb.prepare() %>%
as.matrix()
test_Y_num = as.numeric(data_tabc$target[data_tabc$dataset == "test"]) - 1
train_XY_lgb = lgb.Dataset(train_X_lgb, label = train_Y_lgb)
set.seed(1234)
params_lgb = list(
objective = "binary", # type of exercise
metric = "auc", # metric to be evaluated
learning_rate = 0.01, # shrinkage rate
max_depth = 10, # max depth for tree model (used to deal with over-fitting when data is small)
num_leaves = 20, # max number of leaves (nodes) in one tree
is_unbalance = T, # is data unbalanced
min_data_in_leaf = 1, # min number of data in one leaf (used to deal with over-fitting)
feature_fraction = 0.8, # randomly select part of the features on each iteration
bagging_fraction = 0.8, # randomly select part of the data without resampling
bagging_freq = 5, # if != 0, enables bagging, performs bagging at every k iteration
num_threads = 6 # number of cpu cores (not threads) to use
)
post_fusion_tabc_lgb_cv = lgb.cv(
params = params_lgb, # hyperparameters
data = train_XY_lgb, # lgb.Dataset object for training
eval = lgb.normalizedgini, # custom metric, additional to the first metric
nrounds = 1000, # maximum iterations
early_stopping_rounds = 50, # stop if the eval metric hasn't improved in 50 rounds
verbose = 1, # enable verbose
eval_freq = 50, # verbose every n iterations
nfold = 5 # k-folds CV
)
post_fusion_tabc_lgb_model <- lgb.train(
params = params_lgb, # hyperparameters
data = train_XY_lgb, # lgb.Dataset object for training
valids = list(train = train_XY_lgb), # lgb.Dataset object for validation
eval = lgb.normalizedgini, # custom metric, additional to the first metric
nrounds = post_fusion_tabc_lgb_cv$best_iter, # nrounds from CV
verbose = 1, # enable verbose
eval_freq = 50 # verbose every n iterations
)
# save(post_fusion_tabc_lgb_cv, file = "./../data/post_fusion_tabc_lgb_cv.RData")
# lgb.save(post_fusion_tabc_lgb_model, filename = "./../data/post_fusion_tabc_lgb_model.txt")
load("./../data/post_fusion_tabc_lgb_cv.RData")
post_fusion_tabc_lgb_model = lgb.load("./../data/post_fusion_tabc_lgb_model.txt")
post_fusion_tabc_lgb_train_preds = predict(post_fusion_tabc_lgb_model, train_X_lgb)
post_fusion_tabc_lgb_test_preds = predict(post_fusion_tabc_lgb_model, test_X_lgb)
post_fusion_tabc_lgb_train_ngini = normalizedGini(train_Y_num, post_fusion_tabc_lgb_train_preds)
post_fusion_tabc_lgb_test_ngini = normalizedGini(test_Y_num, post_fusion_tabc_lgb_test_preds)
ngini_valid = post_fusion_tabc_lgb_cv[["record_evals"]][["valid"]][["Norm-gini"]][["eval"]] %>% unlist %>% max
c("Normalized Gini Coeff. (Train)" = post_fusion_tabc_lgb_train_ngini,
"Normalized Gini Coeff. (Valid - 5fCV)" = ngini_valid,
"Normalized Gini Coeff. (Test)" = post_fusion_tabc_lgb_test_ngini) Normalized Gini Coeff. (Train)
0.3531964
Normalized Gini Coeff. (Valid - 5fCV)
0.2756634
Normalized Gini Coeff. (Test)
0.2847164
# Clean everything up before continuing the processing and running a new benchmark!
rm(list = ls()[grep("_lgb", ls())])
gc()
          used   (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   3199621  170.9    5943202  317.5   4664114  249.1
Vcells 168182274 1283.2  282796496 2157.6 282631105 2156.4
The AFDM-based merges lowered the Gini coefficient substantially. We therefore keep only the contingency-table merges, even though they too reduce the score slightly: these merges are necessary, and their cost (the small loss of score) remains reasonable.
Export
Having transformed both samples, we can split them back apart using the "dataset" column. Two datasets are exported here (remembering to also capture the column types and export those as well).
Further transformations are considered next; they are not mandatory, merely recommended, so any that lower the score will not be kept. In that case we will keep working with the current datasets.
train <- data_tabc %>% filter(dataset=="train") %>% select(-dataset)
test <- data_tabc %>% filter(dataset=="test") %>% select(-dataset)
type_col = sapply(train, class)
saveRDS(object = type_col,
file = "./../data/type_col.rds")
write.csv(x = train,
file = "./../data/train.csv",
quote = T,
row.names = F)
write.csv(x = test,
file = "./../data/test.csv",
quote = T,
row.names = F)
Transformation of continuous variables
[1] "id" "ps_car_13" "ps_reg_03" "ps_ind_03" "ps_reg_01"
[6] "ps_ind_15" "ps_car_14" "ps_car_15" "ps_ind_01" "ps_reg_02"
[11] "ps_car_12" "ps_calc_14" "ps_calc_10" "ps_calc_03" "ps_calc_02"
[16] "ps_calc_11" "ps_car_11"
# Box-Cox retained
data_afdm = data_afdm %>%
  mutate(ps_car_13 = (boxcox_transformation(ps_car_13, ps_car_13_checkup$lambda) - ps_car_13_checkup$boxcox_mean)/ps_car_13_checkup$boxcox_std)
data_tabc = data_tabc %>%
  mutate(ps_car_13 = (boxcox_transformation(ps_car_13, ps_car_13_checkup$lambda) - ps_car_13_checkup$boxcox_mean)/ps_car_13_checkup$boxcox_std)
# Box-Cox retained
data_afdm = data_afdm %>%
  mutate(ps_reg_03 = (boxcox_transformation(ps_reg_03, ps_reg_03_checkup$lambda) - ps_reg_03_checkup$boxcox_mean)/ps_reg_03_checkup$boxcox_std)
data_tabc = data_tabc %>%
  mutate(ps_reg_03 = (boxcox_transformation(ps_reg_03, ps_reg_03_checkup$lambda) - ps_reg_03_checkup$boxcox_mean)/ps_reg_03_checkup$boxcox_std)
# ps_car_12_checkup = gg_transformation_checkup(data_sel, "ps_car_12")
#
# # log retained
# data_afdm = data_afdm %>%
#   mutate(ps_car_12 = (log(ps_car_12) - ps_car_12_checkup$log_mean)/ps_car_12_checkup$log_std)
# data_tabc = data_tabc %>%
#   mutate(ps_car_12 = (log(ps_car_12) - ps_car_12_checkup$log_mean)/ps_car_12_checkup$log_std)
# log retained
data_afdm = data_afdm %>%
  mutate(ps_car_14 = (log(ps_car_14) - ps_car_14_checkup$log_mean)/ps_car_14_checkup$log_std)
data_tabc = data_tabc %>%
  mutate(ps_car_14 = (log(ps_car_14) - ps_car_14_checkup$log_mean)/ps_car_14_checkup$log_std)
Benchmark - LGBM (on AFDM data)
# LGBM
train_X_lgb = data_afdm %>%
filter(dataset == "train") %>%
select(-id, -target, -dataset) %>%
lgb.prepare() %>%
as.matrix()
train_Y_lgb = data_afdm %>%
filter(dataset == "train") %>%
select(target) %>%
lgb.prepare() %>%
as.matrix() - 1
train_Y_num = as.numeric(data_afdm$target[data_afdm$dataset == "train"]) - 1
test_X_lgb = data_afdm %>%
filter(dataset == "test") %>%
select(-id, -target, -dataset) %>%
lgb.prepare() %>%
as.matrix()
test_Y_lgb = data_afdm %>%
filter(dataset == "test") %>%
select(target) %>%
lgb.prepare() %>%
as.matrix()
test_Y_num = as.numeric(data_afdm$target[data_afdm$dataset == "test"]) - 1
train_XY_lgb = lgb.Dataset(train_X_lgb, label = train_Y_lgb)
set.seed(1234)
params_lgb = list(
objective = "binary", # type of exercise
metric = "auc", # metric to be evaluated
learning_rate = 0.01, # shrinkage rate
max_depth = 10, # max depth for tree model (used to deal with over-fitting when data is small)
num_leaves = 20, # max number of leaves (nodes) in one tree
is_unbalance = T, # is data unbalanced
min_data_in_leaf = 1, # min number of data in one leaf (used to deal with over-fitting)
feature_fraction = 0.8, # randomly select part of the features on each iteration
bagging_fraction = 0.8, # randomly select part of the data without resampling
bagging_freq = 5, # if != 0, enables bagging, performs bagging at every k iteration
num_threads = 6 # number of cpu cores (not threads) to use
)
post_traite_afdm_lgb_cv = lgb.cv(
params = params_lgb, # hyperparameters
data = train_XY_lgb, # lgb.Dataset object for training
eval = lgb.normalizedgini, # custom metric, additional to the first metric
nrounds = 1000, # maximum iterations
early_stopping_rounds = 50, # stop if the eval metric hasn't improved in 50 rounds
verbose = 1, # enable verbose
eval_freq = 50, # verbose every n iterations
nfold = 5 # k-folds CV
)
post_traite_afdm_lgb_model <- lgb.train(
params = params_lgb, # hyperparameters
data = train_XY_lgb, # lgb.Dataset object for training
valids = list(train = train_XY_lgb), # lgb.Dataset object for validation
eval = lgb.normalizedgini, # custom metric, additional to the first metric
nrounds = post_traite_afdm_lgb_cv$best_iter, # nrounds from CV
verbose = 1, # enable verbose
eval_freq = 50 # verbose every n iterations
)
# save(post_traite_afdm_lgb_cv, file = "./../data/post_traite_afdm_lgb_cv.RData")
# lgb.save(post_traite_afdm_lgb_model, filename = "./../data/post_traite_afdm_lgb_model.txt")
load("./../data/post_traite_afdm_lgb_cv.RData")
post_traite_afdm_lgb_model = lgb.load("./../data/post_traite_afdm_lgb_model.txt")
post_traite_afdm_lgb_train_preds = predict(post_traite_afdm_lgb_model, train_X_lgb)
post_traite_afdm_lgb_test_preds = predict(post_traite_afdm_lgb_model, test_X_lgb)
post_traite_afdm_lgb_train_ngini = normalizedGini(train_Y_num, post_traite_afdm_lgb_train_preds)
post_traite_afdm_lgb_test_ngini = normalizedGini(test_Y_num, post_traite_afdm_lgb_test_preds)
ngini_valid = post_traite_afdm_lgb_cv[["record_evals"]][["valid"]][["Norm-gini"]][["eval"]] %>% unlist %>% max
c("Normalized Gini Coeff. (Train)" = post_traite_afdm_lgb_train_ngini,
"Normalized Gini Coeff. (Valid - 5fCV)" = ngini_valid,
"Normalized Gini Coeff. (Test)" = post_traite_afdm_lgb_test_ngini) Normalized Gini Coeff. (Train)
0.3489638
Normalized Gini Coeff. (Valid - 5fCV)
0.2728511
Normalized Gini Coeff. (Test)
0.2826310
Benchmark - LGBM (on TABC data)
# LGBM
train_X_lgb = data_tabc %>%
filter(dataset == "train") %>%
select(-id, -target, -dataset) %>%
lgb.prepare() %>%
as.matrix()
train_Y_lgb = data_tabc %>%
filter(dataset == "train") %>%
select(target) %>%
lgb.prepare() %>%
as.matrix() - 1
train_Y_num = as.numeric(data_tabc$target[data_tabc$dataset == "train"]) - 1
test_X_lgb = data_tabc %>%
filter(dataset == "test") %>%
select(-id, -target, -dataset) %>%
lgb.prepare() %>%
as.matrix()
test_Y_lgb = data_tabc %>%
filter(dataset == "test") %>%
select(target) %>%
lgb.prepare() %>%
as.matrix()
test_Y_num = as.numeric(data_tabc$target[data_tabc$dataset == "test"]) - 1
train_XY_lgb = lgb.Dataset(train_X_lgb, label = train_Y_lgb)
set.seed(1234)
params_lgb = list(
objective = "binary", # type of exercise
metric = "auc", # metric to be evaluated
learning_rate = 0.01, # shrinkage rate
max_depth = 10, # max depth for tree model (used to deal with over-fitting when data is small)
num_leaves = 20, # max number of leaves (nodes) in one tree
is_unbalance = T, # is data unbalanced
min_data_in_leaf = 1, # min number of data in one leaf (used to deal with over-fitting)
feature_fraction = 0.8, # randomly select part of the features on each iteration
bagging_fraction = 0.8, # randomly select part of the data without resampling
bagging_freq = 5, # if != 0, enables bagging, performs bagging at every k iteration
num_threads = 6 # number of cpu cores (not threads) to use
)
post_traite_tabc_lgb_cv = lgb.cv(
params = params_lgb, # hyperparameters
data = train_XY_lgb, # lgb.Dataset object for training
eval = lgb.normalizedgini, # custom metric, additional to the first metric
nrounds = 1000, # maximum iterations
early_stopping_rounds = 50, # stop if the eval metric hasn't improved in 50 rounds
verbose = 1, # enable verbose
eval_freq = 50, # verbose every n iterations
nfold = 5 # k-folds CV
)
post_traite_tabc_lgb_model <- lgb.train(
params = params_lgb, # hyperparameters
data = train_XY_lgb, # lgb.Dataset object for training
valids = list(train = train_XY_lgb), # lgb.Dataset object for validation
eval = lgb.normalizedgini, # custom metric, additional to the first metric
nrounds = post_traite_tabc_lgb_cv$best_iter, # nrounds from CV
verbose = 1, # enable verbose
eval_freq = 50 # verbose every n iterations
)
# save(post_traite_tabc_lgb_cv, file = "./../data/post_traite_tabc_lgb_cv.RData")
# lgb.save(post_traite_tabc_lgb_model, filename = "./../data/post_traite_tabc_lgb_model.txt")
load("./../data/post_traite_tabc_lgb_cv.RData")
post_traite_tabc_lgb_model = lgb.load("./../data/post_traite_tabc_lgb_model.txt")
post_traite_tabc_lgb_train_preds = predict(post_traite_tabc_lgb_model, train_X_lgb)
post_traite_tabc_lgb_test_preds = predict(post_traite_tabc_lgb_model, test_X_lgb)
post_traite_tabc_lgb_train_ngini = normalizedGini(train_Y_num, post_traite_tabc_lgb_train_preds)
post_traite_tabc_lgb_test_ngini = normalizedGini(test_Y_num, post_traite_tabc_lgb_test_preds)
ngini_valid = post_traite_tabc_lgb_cv[["record_evals"]][["valid"]][["Norm-gini"]][["eval"]] %>% unlist %>% max
c("Normalized Gini Coeff. (Train)" = post_traite_tabc_lgb_train_ngini,
"Normalized Gini Coeff. (Valid - 5fCV)" = ngini_valid,
"Normalized Gini Coeff. (Test)" = post_traite_tabc_lgb_test_ngini) Normalized Gini Coeff. (Train)
0.3419099
Normalized Gini Coeff. (Valid - 5fCV)
0.2751949
Normalized Gini Coeff. (Test)
0.2840637
The scores decreased on both datasets, and significantly so for mere transformations (normalizing four continuous variables). We drop these transformations to stay focused on optimizing the score. On the other hand, we keep the contingency-table merges of the categorical variables' levels: they are necessary, even though the score decreases slightly.